3. Train-Predict-LogLoss

Tensorboard


In [1]:
import time
import os
import pandas as pd

project_name = 'Dog_Breed_Identification'
step_name = 'Train'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)

cwd = os.getcwd()
model_path = os.path.join(cwd, 'model')
print('model_path: ' + model_path)


run_name: Dog_Breed_Identification_Train_20171024_231140
model_path: E:\Kaggle\Dog_Breed_Identification\model

In [2]:
import h5py
import numpy as np
from sklearn.utils import shuffle
np.random.seed(2017)

x_train = []
y_train = {}
x_val = []
y_val = {}
x_test = []

cwd = os.getcwd()
feature_cgg16 = os.path.join(cwd, 'model', 'feature_VGG16_{}.h5'.format(171023))
feature_cgg19 = os.path.join(cwd, 'model', 'feature_VGG19_{}.h5'.format(171023))
feature_resnet50 = os.path.join(cwd, 'model', 'feature_ResNet50_{}.h5'.format(171023))
feature_xception = os.path.join(cwd, 'model', 'feature_Xception_{}.h5'.format(171023))
feature_inception = os.path.join(cwd, 'model', 'feature_InceptionV3_{}.h5'.format(171023))
for filename in [feature_cgg16, feature_cgg19, feature_resnet50, feature_xception, feature_inception]:
    with h5py.File(filename, 'r') as h:
        x_train.append(np.array(h['train']))
        y_train = np.array(h['train_label'])
        x_val.append(np.array(h['val']))
        y_val = np.array(h['val_label'])
        x_test.append(np.array(h['test']))

# print(x_train[0].shape)
x_train = np.concatenate(x_train, axis=-1)
# y_train = np.concatenate(y_train, axis=0)
x_val = np.concatenate(x_val, axis=-1)
# y_val = np.concatenate(y_val, axis=0)
x_test = np.concatenate(x_test, axis=-1)
print(x_train.shape)
print(x_train.shape[1:])

print(len(y_train))
print(x_val.shape)
print(len(y_val))
print(x_test.shape)


(9710, 7168)
(7168,)
9710
(512, 7168)
512
(10357, 7168)

In [3]:
from sklearn.utils import shuffle
(x_train, y_train) = shuffle(x_train, y_train)

In [4]:
# from keras.utils.np_utils import to_categorical

# y_train = to_categorical(y_train)
# y_val = to_categorical(y_val)
# print(y_train.shape)
# print(y_val.shape)

In [5]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=2017)
logreg.fit(x_train, y_train)


Out[5]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=2017, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
val_proba = logreg.predict_proba(x_val)
val_preds = logreg.predict(x_val)
print(val_proba.shape)
print(val_preds.shape)
print(val_proba[:,1].shape)
print(y_val.shape)


(512, 120)
(512,)
(512,)
(512,)

In [7]:
from keras.utils.np_utils import to_categorical

print(val_proba[0])
print(y_val[0])

log_loss_y_val = to_categorical(y_val)
print(log_loss_y_val[0])


Using TensorFlow backend.
[  7.45460695e-01   2.84293267e-13   7.99581004e-07   6.30760697e-10
   2.03445622e-10   5.68373631e-14   3.84237447e-06   3.01967760e-15
   1.01069262e-15   4.74401280e-18   1.10405996e-16   8.27518972e-10
   2.50406755e-12   1.08462627e-11   2.31789401e-12   3.86012425e-14
   8.37836590e-07   1.85452544e-07   2.02762325e-10   4.54344195e-16
   7.71978296e-02   7.69646131e-14   2.01320491e-07   1.08736564e-04
   4.40551336e-11   3.29920291e-16   1.05499083e-03   2.14203865e-12
   5.98527877e-13   2.45454789e-10   3.09715586e-10   5.04921363e-11
   1.67381259e-04   8.04746097e-08   3.47277626e-13   1.05895393e-03
   1.06922141e-08   7.86777409e-14   7.32534644e-13   1.12683736e-19
   2.10097142e-08   6.55693483e-11   9.85671500e-16   3.37071171e-12
   1.47211587e-09   2.05518572e-11   3.16100207e-11   1.12251105e-16
   1.52625284e-01   5.55334659e-10   1.64261477e-05   2.17019171e-12
   1.29496133e-13   8.67372791e-15   1.23480327e-06   5.50844833e-13
   7.19406556e-08   3.63438591e-08   1.00241108e-11   3.48586727e-09
   9.53883090e-13   6.93033969e-14   1.95808134e-07   6.95825987e-12
   1.13588236e-10   2.63713937e-14   6.23287163e-15   1.14037006e-12
   1.70788063e-06   3.01040874e-06   3.40917938e-07   3.50069290e-13
   1.26117784e-15   8.82925419e-13   3.42393458e-14   1.05821855e-13
   4.12964397e-11   1.99998060e-06   1.18094318e-04   7.69080843e-06
   5.22391907e-12   1.41713495e-06   6.31032603e-10   5.36622882e-08
   3.32294603e-10   4.84957310e-09   1.14176443e-13   3.85829240e-13
   5.06622070e-09   1.61990635e-18   4.98259264e-15   5.81044510e-12
   4.47086584e-13   5.30620852e-12   2.78634943e-17   6.63298446e-11
   2.15748300e-04   1.11175491e-09   1.96832798e-06   2.25927387e-08
   2.44025292e-08   1.09619661e-16   1.99965282e-02   2.46464102e-09
   8.35568274e-11   1.95063074e-11   1.81687298e-03   6.31478090e-05
   2.80427626e-06   6.94374335e-05   1.44285349e-08   1.08477230e-13
   5.41475761e-15   5.71005516e-20   8.49971658e-15   6.66803221e-11
   1.28067037e-07   1.00393553e-10   3.31866176e-07   8.18397472e-07]
0
[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

In [8]:
from sklearn.metrics import log_loss, accuracy_score
print('Val log_loss: {}'.format(log_loss(log_loss_y_val, val_proba)))
val_proba_limit = np.clip(log_loss_y_val, 0.005, 0.995)
print('Val log_loss: {}'.format(log_loss(log_loss_y_val, val_proba_limit)))
print('Val accuracy_score: {}'.format(accuracy_score(y_val, val_preds)))


Val log_loss: 2.0989835870136866
Val log_loss: 0.4687465580556839
Val accuracy_score: 0.650390625

Predict


In [9]:
# Used to load model directly and skip train
# import os
# from keras.models import load_model
# cwd = os.getcwd()
# model = load_model(os.path.join(cwd, 'model', 'Dog_Breed_Identification_Train_20171024_155154.h5'))

In [10]:
y_pred = logreg.predict_proba(x_test)
print(y_pred.shape)


(10357, 120)

In [11]:
print(y_pred[:10])
y_pred = np.clip(y_pred, 0.005, 0.995)
print(y_pred[:10])


[[  1.78144831e-15   1.40848716e-12   9.76840904e-16 ...,   3.42443039e-18
    7.94279730e-17   1.51666824e-15]
 [  1.98223709e-35   1.24450424e-54   2.09200125e-40 ...,   1.53264574e-46
    8.07442231e-39   1.66204655e-39]
 [  7.32774599e-09   2.46212414e-04   1.78339160e-10 ...,   2.63654998e-08
    3.99599238e-07   5.17673054e-06]
 ..., 
 [  1.11767178e-26   4.38152562e-35   2.45042291e-22 ...,   2.31267926e-28
    1.59669927e-21   9.52783844e-24]
 [  1.66447477e-04   1.03118257e-06   1.07335403e-07 ...,   2.21226441e-09
    1.67821244e-08   7.32713010e-06]
 [  7.66335755e-19   4.31930469e-18   2.62540770e-19 ...,   1.23493296e-24
    7.39194944e-18   1.43506157e-15]]
[[ 0.005  0.005  0.005 ...,  0.005  0.005  0.005]
 [ 0.005  0.005  0.005 ...,  0.005  0.005  0.005]
 [ 0.005  0.005  0.005 ...,  0.005  0.005  0.005]
 ..., 
 [ 0.005  0.005  0.005 ...,  0.005  0.005  0.005]
 [ 0.005  0.005  0.005 ...,  0.005  0.005  0.005]
 [ 0.005  0.005  0.005 ...,  0.005  0.005  0.005]]

In [12]:
files = os.listdir(os.path.join(cwd, 'input', 'data_test', 'test'))
print(files[:10])


['000621fb3cbb32d8935728e48679680e.jpg', '00102ee9d8eb90812350685311fe5890.jpg', '0012a730dfa437f5f3613fb75efcd4ce.jpg', '001510bc8570bbeee98c8d80c8a95ec1.jpg', '001a5f3114548acdefa3d4da05474c2e.jpg', '00225dcd3e4d2410dd53239f95c0352f.jpg', '002c2a3117c2193b4d26400ce431eebd.jpg', '002c58d413a521ae8d1a5daeb35fc803.jpg', '002f80396f1e3db687c5932d7978b196.jpg', '0036c6bcec6031be9e62a257b1c3c442.jpg']

In [13]:
cwd = os.getcwd()
df = pd.read_csv(os.path.join(cwd, 'input', 'labels.csv'))
print('lables amount: %d' %len(df))
df.head()


lables amount: 10222
Out[13]:
id breed
0 000bec180eb18c7604dcecc8fe0dba07 boston_bull
1 001513dfcb2ffafc82cccf4d8bbaba97 dingo
2 001cdf01b096e06d78e9e5112d419397 pekinese
3 00214f311d5d2247d5dfe4fe24b2303d bluetick
4 0021f9ceb3235effd7fcde7f7538ed62 golden_retriever

In [14]:
n = len(df)
breed = set(df['breed'])
n_class = len(breed)
class_to_num = dict(zip(breed, range(n_class)))
num_to_class = dict(zip(range(n_class), breed))
print(breed)


{'giant_schnauzer', 'siberian_husky', 'norwegian_elkhound', 'newfoundland', 'weimaraner', 'mexican_hairless', 'otterhound', 'norfolk_terrier', 'english_foxhound', 'pug', 'italian_greyhound', 'groenendael', 'leonberg', 'saint_bernard', 'old_english_sheepdog', 'malinois', 'standard_poodle', 'eskimo_dog', 'kerry_blue_terrier', 'pekinese', 'great_pyrenees', 'ibizan_hound', 'german_shepherd', 'saluki', 'afghan_hound', 'keeshond', 'standard_schnauzer', 'irish_water_spaniel', 'shetland_sheepdog', 'pomeranian', 'welsh_springer_spaniel', 'papillon', 'briard', 'irish_terrier', 'samoyed', 'schipperke', 'lakeland_terrier', 'african_hunting_dog', 'bluetick', 'whippet', 'miniature_pinscher', 'shih-tzu', 'tibetan_mastiff', 'dhole', 'black-and-tan_coonhound', 'affenpinscher', 'norwich_terrier', 'clumber', 'english_setter', 'irish_setter', 'scottish_deerhound', 'german_short-haired_pointer', 'greater_swiss_mountain_dog', 'malamute', 'irish_wolfhound', 'collie', 'dingo', 'dandie_dinmont', 'cocker_spaniel', 'airedale', 'pembroke', 'cairn', 'staffordshire_bullterrier', 'gordon_setter', 'soft-coated_wheaten_terrier', 'redbone', 'great_dane', 'yorkshire_terrier', 'boxer', 'komondor', 'doberman', 'rottweiler', 'kelpie', 'miniature_poodle', 'bloodhound', 'american_staffordshire_terrier', 'silky_terrier', 'maltese_dog', 'bedlington_terrier', 'flat-coated_retriever', 'appenzeller', 'basset', 'vizsla', 'bernese_mountain_dog', 'bouvier_des_flandres', 'bull_mastiff', 'japanese_spaniel', 'sealyham_terrier', 'basenji', 'chow', 'australian_terrier', 'boston_bull', 'chesapeake_bay_retriever', 'english_springer', 'blenheim_spaniel', 'brittany_spaniel', 'wire-haired_fox_terrier', 'sussex_spaniel', 'golden_retriever', 'west_highland_white_terrier', 'entlebucher', 'curly-coated_retriever', 'kuvasz', 'border_collie', 'miniature_schnauzer', 'lhasa', 'rhodesian_ridgeback', 'beagle', 'scotch_terrier', 'cardigan', 'brabancon_griffon', 'labrador_retriever', 'toy_poodle', 'walker_hound', 'french_bulldog', 'chihuahua', 'tibetan_terrier', 'toy_terrier', 'border_terrier', 'borzoi'}

In [15]:
df2 = pd.read_csv('.\\input\\sample_submission.csv')
n_test = len(df2)
print(df2.shape)


(10357, 121)

In [16]:
for i in range(0, 120):
    df2.iloc[:,[i+1]] = y_pred[:,i]
df2.to_csv('.\\output\\pred.csv', index=None)

In [ ]:


In [17]:
print('Done !')


Done !

In [ ]: